{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ur8xi4C7S06n"
      },
      "outputs": [],
      "source": [
        "# Copyright 2024 Google LLC\n",
        "#\n",
        "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
        "# you may not use this file except in compliance with the License.\n",
        "# You may obtain a copy of the License at\n",
        "#\n",
        "#     https://www.apache.org/licenses/LICENSE-2.0\n",
        "#\n",
        "# Unless required by applicable law or agreed to in writing, software\n",
        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
        "# See the License for the specific language governing permissions and\n",
        "# limitations under the License."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "JAPoU8Sm5E6e"
      },
      "source": [
        "# RAG Based on Sensitive Data Protection using Faker\n",
        "\n",
        "\n",
        "<table align=\"left\">\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/RAG_Based_on_Sensitive_Data_Protection_using_Faker.ipynb\">\n",
        "      <img width=\"32px\" src=\"https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg\" alt=\"Google Colaboratory logo\"><br> Open in Colab\n",
        "    </a>\n",
        "  </td>\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fuse-cases%2Fretrieval-augmented-generation%2FRAG_Based_on_Sensitive_Data_Protection_using_Faker.ipynb\">\n",
        "      <img width=\"32px\" src=\"https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN\" alt=\"Google Cloud Colab Enterprise logo\"><br> Open in Colab Enterprise\n",
        "    </a>\n",
        "  </td>    \n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/use-cases/retrieval-augmented-generation/RAG_Based_on_Sensitive_Data_Protection_using_Faker.ipynb\">\n",
        "      <img src=\"https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32\" alt=\"Vertex AI logo\"><br> Open in Workbench\n",
        "    </a>\n",
        "  </td>\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/RAG_Based_on_Sensitive_Data_Protection_using_Faker.ipynb\">\n",
        "      <img width=\"32px\" src=\"https://raw.githubusercontent.com/primer/octicons/refs/heads/main/icons/mark-github-24.svg\" alt=\"GitHub logo\"><br> View on GitHub\n",
        "    </a>\n",
        "  </td>\n",
        "</table>\n",
        "\n",
        "<div style=\"clear: both;\"></div>\n",
        "\n",
        "<b>Share to:</b>\n",
        "\n",
        "<a href=\"https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_based_on_sensitive_data_protection_using_faker.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg\" alt=\"LinkedIn logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_based_on_sensitive_data_protection_using_faker.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg\" alt=\"Bluesky logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_based_on_sensitive_data_protection_using_faker.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg\" alt=\"X logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_based_on_sensitive_data_protection_using_faker.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png\" alt=\"Reddit logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/retrieval-augmented-generation/rag_based_on_sensitive_data_protection_using_faker.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg\" alt=\"Facebook logo\">\n",
        "</a>            "
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "84f0f73a0f76"
      },
      "source": [
        "| | |\n",
        "|-|-|\n",
        "|Author(s) | [Omotayo Aina](https://github.com/ainaomotayo) | "
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "tvgnzT1CKxrO"
      },
      "source": [
        "## Overview\n",
        "\n",
        "This notebook shows how to use Cloud Data Loss Prevention (Cloud DLP), which is now part of Sensitive Data Protection, to anonymize PII by replacing each detected value with fake data generated by the [Faker library](https://github.com/joke2k/faker).\n",
        "\n",
        "Normally, only crypto-based tokenization transformations are reversible. To make the replacement transformation reversible as well, this notebook uses a Firestore database to store each original value together with the fake value that replaced it.\n",
        "\n",
        "In this notebook, you will learn how to implement RAG with Sensitive Data Protection to comply with your privacy requirements. You will create text embeddings from a publicly available page on the [Vodafone site](https://www.vodafone.com/about-vodafone/who-we-are/leadership/executive-committee/margherita-della-valle).\n",
        "\n",
        "- [Gemini](https://ai.google.dev/models/gemini) is a family of generative AI models that lets developers generate content and solve problems. These models are designed and trained to handle both text and images as input.\n",
        "\n",
        "- [LangChain](https://www.langchain.com/) is a framework designed to make integration of Large Language Models (LLM) like Gemini easier for applications.\n",
        "\n",
        "- [Chroma](https://python.langchain.com/docs/integrations/vectorstores/chroma) is an open-source embedding database. Chroma makes it easy to build LLM apps by making knowledge, facts, and skills pluggable for LLMs.\n",
        "\n",
        "- [Firestore](https://cloud.google.com/firestore/docs/overview) is a flexible, scalable database for mobile, web, and server development from Firebase and Google Cloud.\n",
        "\n",
        "- [Sensitive Data Protection](https://cloud.google.com/sensitive-data-protection/docs/sensitive-data-protection-overview) provides access to a powerful sensitive data inspection, classification, and de-identification platform.\n",
        "\n",
        "- [Faker](https://github.com/joke2k/faker) is a Python package that generates fake data for you.\n",
        "\n",
        "For more information, see the [Generative AI on Vertex AI](https://cloud.google.com/vertex-ai/docs/generative-ai/learn/overview) documentation."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "61RBz8LLbxCR"
      },
      "source": [
        "## Getting Started"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "No17Cw5hgx12"
      },
      "source": [
        "### Install Vertex AI SDK and other required packages"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "tFy3H3aPgx12"
      },
      "outputs": [],
      "source": [
        "# langchain-community provides the WebBaseLoader and Chroma integrations used below;\n",
        "# beautifulsoup4 is required by WebBaseLoader to parse the downloaded page.\n",
        "%pip install --upgrade --user --quiet google-cloud-aiplatform google-cloud-firestore Faker google-cloud-dlp langchain-core langchain-community langchain_google_vertexai chromadb beautifulsoup4"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "R5Xep4W9lq-Z"
      },
      "source": [
        "### Restart runtime\n",
        "\n",
        "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n",
        "\n",
        "The restart might take a minute or longer. After it's restarted, continue to the next step."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "XRvKdaPDTznN"
      },
      "outputs": [],
      "source": [
        "import IPython\n",
        "\n",
        "app = IPython.Application.instance()\n",
        "app.kernel.do_shutdown(True)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "SbmM4z7FOBpM"
      },
      "source": [
        "<div class=\"alert alert-block alert-warning\">\n",
        "<b>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️</b>\n",
        "</div>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "dmWOrTJ3gx13"
      },
      "source": [
        "### Authenticate your notebook environment (Colab only)\n",
        "\n",
        "If you are running this notebook on Google Colab, run the cell below to authenticate your environment."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "NyKGtVQjgx13"
      },
      "outputs": [],
      "source": [
        "import sys\n",
        "\n",
        "if \"google.colab\" in sys.modules:\n",
        "    from google.colab import auth\n",
        "\n",
        "    auth.authenticate_user()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "DF4l8DTdWgPY"
      },
      "source": [
        "### Set Google Cloud project information and initialize Vertex AI SDK\n",
        "\n",
        "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n",
        "\n",
        "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Nqwi-5ufWp_B"
      },
      "outputs": [],
      "source": [
        "import os\n",
        "\n",
        "import vertexai\n",
        "\n",
        "PROJECT_ID = \"[your-project-id]\"  # @param {type:\"string\"}\n",
        "LOCATION = \"us-central1\"  # @param {type:\"string\"}\n",
        "\n",
        "# Fall back to the environment when the placeholder above was not replaced\n",
        "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n",
        "    PROJECT_ID = os.environ.get(\"GOOGLE_CLOUD_PROJECT\", \"\")\n",
        "\n",
        "# Fail here with a clear message rather than later inside an API call\n",
        "if not PROJECT_ID:\n",
        "    raise ValueError(\n",
        "        \"Set PROJECT_ID to your Google Cloud project ID before continuing.\"\n",
        "    )\n",
        "\n",
        "vertexai.init(project=PROJECT_ID, location=LOCATION)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "395c0c4de815"
      },
      "source": [
        "### Enable the Vertex AI, Firestore, and Sensitive Data Protection (DLP) APIs"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ff4bd32c99ab"
      },
      "outputs": [],
      "source": [
        "!gcloud services enable aiplatform.googleapis.com firestore.googleapis.com dlp.googleapis.com --project={PROJECT_ID}"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "8b27c495ef75"
      },
      "source": [
        "### Create the default Firestore database in Native mode"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "5e08769ab86e"
      },
      "outputs": [],
      "source": [
        "!gcloud firestore databases create --project=$PROJECT_ID --location=$LOCATION"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "24b55f68c7cb"
      },
      "source": [
        "## Architecture View\n",
        "![image.png](https://storage.googleapis.com/github-repo/generative-ai/gemini/use-cases/retrieval-augmented-generation/RAG_Based_on_Sensitive_Data_Protection_using_Faker/image.png)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "8689d7a2cb49"
      },
      "source": [
        "### Architecture Workflow:\n",
        "\n",
        "1. The user prompt is anonymized: the original sensitive data is replaced with fake data generated by the Faker library, and each original/fake pair is stored in Firestore.\n",
        "\n",
        "2. An embedding is created from the anonymized prompt.\n",
        "\n",
        "3. A semantic search is run against the vector database using the anonymized prompt.\n",
        "\n",
        "4. The output is de-anonymized by replacing the fake data with the mapped original data stored in Firestore.\n",
        "\n",
        "For better results, use a more capable data extraction service so the source data is rendered cleanly, and a more accurate sensitive data detector such as the Cloud DLP API."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4f1319830b6e"
      },
      "source": [
        "### Import libraries"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "06a53613d0ef"
      },
      "outputs": [],
      "source": [
        "from faker import Faker\n",
        "from google.cloud import dlp_v2, firestore\n",
        "from google.cloud.dlp_v2.types import ContentItem, InspectConfig, InspectContentRequest\n",
        "\n",
        "# The LangChain names are imported from the packages that define them\n",
        "# (`langchain_core` / `langchain_community`); the old `langchain.*` paths\n",
        "# are deprecated re-exports.\n",
        "from langchain_community.document_loaders import WebBaseLoader\n",
        "from langchain_community.vectorstores import Chroma\n",
        "from langchain_core.documents import Document\n",
        "from langchain_core.output_parsers import StrOutputParser\n",
        "from langchain_core.prompts import PromptTemplate\n",
        "from langchain_core.runnables import RunnablePassthrough\n",
        "\n",
        "# Initialize the Firestore client for the project configured above, instead of\n",
        "# relying on a default project being discoverable from the environment\n",
        "db = firestore.Client(project=PROJECT_ID)\n",
        "\n",
        "# Initialize Faker\n",
        "fake = Faker()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "81b9b200c5c3"
      },
      "outputs": [],
      "source": [
        "# Create an instance of WebBaseLoader with the URL of the webpage to be loaded\n",
        "loader = WebBaseLoader(\n",
        "    \"https://www.vodafone.com/about-vodafone/who-we-are/leadership/executive-committee/margherita-della-valle\"\n",
        ")\n",
        "\n",
        "# Call the load method of the WebBaseLoader instance to fetch and parse the webpage\n",
        "# The parsed webpage is stored in the 'documents' variable\n",
        "documents = loader.load()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "67e671e719e3"
      },
      "outputs": [],
      "source": [
        "# Print the contents of the 'documents' variable\n",
        "# This variable is expected to contain the parsed webpage data\n",
        "print(documents)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "84c7e8abe51e"
      },
      "outputs": [],
      "source": [
        "# Extract the text from the website data document\n",
        "text_content = documents[0].page_content\n",
        "\n",
        "# The text content before the substrings \"plc.\" is relevant for this tutorial.\n",
        "# You can use Python's `split()` to select the required content.\n",
        "final_text = text_content.split(\"plc.\", 1)[0]\n",
        "\n",
        "# Convert the text to LangChain's `Document` format\n",
        "docs = [Document(page_content=final_text, metadata={\"source\": \"local\"})]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "309183bc224c"
      },
      "outputs": [],
      "source": [
        "# Print the contents of the 'docs' variable\n",
        "# This list holds the trimmed page text wrapped in LangChain's `Document` format\n",
        "print(docs)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7e46de41e8a3"
      },
      "source": [
        "The code defines a class for reversible anonymization. \n",
        "\n",
        "The class uses the Google Cloud Data Loss Prevention (DLP) service to identify sensitive information in text, replaces the sensitive information with fake data generated by the Faker library, and stores the mapping from the original data to the fake data in Google Cloud Firestore. \n",
        "\n",
        "The class also provides a method for de-anonymizing text by replacing the fake data with the original data."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "4b68bcdebc6f"
      },
      "outputs": [],
      "source": [
        "# Regular expressions are used to replace all values in a single pass\n",
        "import re\n",
        "\n",
        "# Import the Google API exceptions module\n",
        "import google.api_core.exceptions as google_exceptions\n",
        "from google.cloud.firestore_v1.base_query import FieldFilter\n",
        "\n",
        "\n",
        "# Define a class for reversible anonymization\n",
        "class ReversibleAnonymizer:\n",
        "    \"\"\"Anonymize text with Cloud DLP and Faker, keeping the mapping reversible.\n",
        "\n",
        "    Sensitive values found by Cloud DLP are replaced with Faker-generated\n",
        "    values. Each replacement is stored in the Firestore collection\n",
        "    ``mappings``: the document ID is the fake value and the ``original_data``\n",
        "    field holds the text it replaced.\n",
        "    \"\"\"\n",
        "\n",
        "    # Initialize the class with a Google Cloud project ID\n",
        "    def __init__(self, project: str):\n",
        "        \"\"\"Create the DLP, Faker and Firestore clients.\n",
        "\n",
        "        Args:\n",
        "            project: Google Cloud project ID used for the DLP requests and\n",
        "                for the Firestore database that stores the mappings.\n",
        "        \"\"\"\n",
        "        # Create a client for the Google Cloud Data Loss Prevention (DLP) service\n",
        "        self.dlp: dlp_v2.DlpServiceClient = dlp_v2.DlpServiceClient()\n",
        "        # Create a Faker instance for generating fake data\n",
        "        self.fake: Faker = Faker()\n",
        "        # Create a Firestore client bound to the same project as the DLP calls,\n",
        "        # instead of whatever project the environment happens to default to\n",
        "        self.db: firestore.Client = firestore.Client(project=project)\n",
        "        # Store the Google Cloud project ID\n",
        "        self.project: str = project\n",
        "\n",
        "    def _generate_fake(self, info_type_name: str) -> str:\n",
        "        \"\"\"Return a Faker value of the same kind as the DLP info type.\"\"\"\n",
        "        if info_type_name == \"PERSON_NAME\":\n",
        "            return self.fake.name()\n",
        "        if info_type_name == \"FIRST_NAME\":\n",
        "            return self.fake.first_name()\n",
        "        if info_type_name == \"LAST_NAME\":\n",
        "            return self.fake.last_name()\n",
        "        # PHONE_NUMBER is the only other info type requested in `anonymize`\n",
        "        return self.fake.phone_number()\n",
        "\n",
        "    def _fake_for(self, original: str, info_type_name: str) -> str:\n",
        "        \"\"\"Return the fake value mapped to `original`, creating the mapping if needed.\"\"\"\n",
        "        mappings = self.db.collection(\"mappings\")\n",
        "        # Reuse an existing mapping so the same original always gets the same fake\n",
        "        existing = list(\n",
        "            mappings.where(filter=FieldFilter(\"original_data\", \"==\", original))\n",
        "            .limit(1)\n",
        "            .stream()\n",
        "        )\n",
        "        if existing:\n",
        "            return existing[0].id\n",
        "\n",
        "        fake_data = self._generate_fake(info_type_name)\n",
        "        # The fake value is the document ID, so it must be unused: writing to an\n",
        "        # existing ID would overwrite the mapping of a different original value\n",
        "        while mappings.document(fake_data).get().exists:\n",
        "            fake_data = self._generate_fake(info_type_name)\n",
        "        # Store the mapping from the fake data back to the original data\n",
        "        mappings.document(fake_data).set({\"original_data\": original})\n",
        "        return fake_data\n",
        "\n",
        "    # Define a method for anonymizing text\n",
        "    def anonymize(self, text_to_deidentify: str) -> str | None:\n",
        "        \"\"\"Replace the sensitive data found in the text with fake data.\n",
        "\n",
        "        Args:\n",
        "            text_to_deidentify: Text to inspect for names and phone numbers.\n",
        "\n",
        "        Returns:\n",
        "            The text with every detected value replaced by its fake value, the\n",
        "            input unchanged when it is empty or nothing was detected, or None\n",
        "            when a Google API call fails (the error is printed).\n",
        "        \"\"\"\n",
        "        # Nothing to inspect, so skip the API calls\n",
        "        if not text_to_deidentify:\n",
        "            return text_to_deidentify\n",
        "        try:\n",
        "            # Define the parent resource for the DLP API call\n",
        "            parent: str = f\"projects/{self.project}/locations/global\"\n",
        "            # Define the types of sensitive information to look for\n",
        "            info_types = [\n",
        "                {\"name\": \"PERSON_NAME\"},\n",
        "                {\"name\": \"PHONE_NUMBER\"},\n",
        "                {\"name\": \"FIRST_NAME\"},\n",
        "                {\"name\": \"LAST_NAME\"},\n",
        "            ]\n",
        "            # Define the configuration for the DLP inspection\n",
        "            inspect_config = InspectConfig(info_types=info_types, include_quote=True)\n",
        "            # Define the item to inspect\n",
        "            item = ContentItem(value=text_to_deidentify)\n",
        "            # Call the DLP API to inspect the text\n",
        "            response = self.dlp.inspect_content(\n",
        "                request=InspectContentRequest(\n",
        "                    parent=parent,\n",
        "                    inspect_config=inspect_config,\n",
        "                    item=item,\n",
        "                )\n",
        "            )\n",
        "\n",
        "            # Keep the first info type reported for each distinct quote\n",
        "            info_type_by_quote: dict[str, str] = {}\n",
        "            for finding in response.result.findings:\n",
        "                if finding.quote:\n",
        "                    info_type_by_quote.setdefault(\n",
        "                        finding.quote, finding.info_type.name\n",
        "                    )\n",
        "            if not info_type_by_quote:\n",
        "                return text_to_deidentify\n",
        "\n",
        "            # Try longer quotes first so that a full name is replaced as a whole\n",
        "            # before a first or last name contained in it\n",
        "            pattern = re.compile(\n",
        "                \"|\".join(\n",
        "                    re.escape(quote)\n",
        "                    for quote in sorted(info_type_by_quote, key=len, reverse=True)\n",
        "                )\n",
        "            )\n",
        "            # Only create mappings for the quotes that are actually replaced\n",
        "            fake_by_quote = {\n",
        "                quote: self._fake_for(quote, info_type_by_quote[quote])\n",
        "                for quote in dict.fromkeys(pattern.findall(text_to_deidentify))\n",
        "            }\n",
        "            # Replace in a single pass so inserted fake data is never rewritten\n",
        "            # by a later replacement\n",
        "            return pattern.sub(\n",
        "                lambda match: fake_by_quote[match.group(0)], text_to_deidentify\n",
        "            )\n",
        "\n",
        "        except google_exceptions.GoogleAPICallError as e:\n",
        "            # If an error occurs, print the error and return None\n",
        "            print(f\"An error occurred: {e}\")\n",
        "            return None\n",
        "\n",
        "    # Define a method for de-anonymizing text\n",
        "    def deanonymize(self, text: str) -> str | None:\n",
        "        \"\"\"Replace the fake data in the text with the original data.\n",
        "\n",
        "        Args:\n",
        "            text: Text that may contain fake values produced by `anonymize`.\n",
        "\n",
        "        Returns:\n",
        "            The text with every known fake value replaced by its original\n",
        "            value, the input unchanged when it is empty or no mapping exists,\n",
        "            or None when a Google API call fails (the error is printed).\n",
        "        \"\"\"\n",
        "        # Nothing to restore, so skip the Firestore read\n",
        "        if not text:\n",
        "            return text\n",
        "        try:\n",
        "            # Load every stored mapping from fake data to original data\n",
        "            original_by_fake: dict[str, str] = {}\n",
        "            for doc in self.db.collection(\"mappings\").stream():\n",
        "                original = (doc.to_dict() or {}).get(\"original_data\")\n",
        "                # Skip documents that do not follow the mapping schema\n",
        "                if isinstance(original, str):\n",
        "                    original_by_fake[doc.id] = original\n",
        "            if not original_by_fake:\n",
        "                return text\n",
        "\n",
        "            # Try longer fake values first and replace in a single pass, so a\n",
        "            # restored original value is never rewritten by another mapping\n",
        "            pattern = re.compile(\n",
        "                \"|\".join(\n",
        "                    re.escape(fake_value)\n",
        "                    for fake_value in sorted(original_by_fake, key=len, reverse=True)\n",
        "                )\n",
        "            )\n",
        "            # Return the de-anonymized text\n",
        "            return pattern.sub(lambda match: original_by_fake[match.group(0)], text)\n",
        "        except google_exceptions.GoogleAPICallError as e:\n",
        "            # If an error occurs, print the error and return None\n",
        "            print(f\"An error occurred: {e}\")\n",
        "            return None"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "c3b202dcbe07"
      },
      "source": [
        "Create an instance of the ReversibleAnonymizer class, passing the Google Cloud project ID as an argument."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "2406d6c348c7"
      },
      "outputs": [],
      "source": [
        "# This instance, 'anonymizer', can now be used to call the anonymize and deanonymize methods defined in the ReversibleAnonymizer class.\n",
        "anonymizer = ReversibleAnonymizer(PROJECT_ID)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "73b9a2eaf909"
      },
      "outputs": [],
      "source": [
        "# This line of code is using a list comprehension to create a new list, 'anonymized_docs'.\n",
        "# It iterates over each 'doc' in the 'docs' list, and for each 'doc', it calls the 'anonymize' method of the 'anonymizer' instance.\n",
        "# The 'anonymize' method takes the 'page_content' of the 'doc' as an argument and returns the anonymized text.\n",
        "# The result is a new list where each element is the anonymized version of the corresponding 'doc' in the 'docs' list.\n",
        "\n",
        "anonymized_docs = [anonymizer.anonymize(doc.page_content) for doc in docs]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "b5f179be6e7f"
      },
      "outputs": [],
      "source": [
        "# 'anonymized_docs' is a list that contains the anonymized versions of the documents in the 'docs' list.\n",
        "# Each document in 'docs' has been processed by the 'anonymize' method of the 'anonymizer' instance, which replaces sensitive information with fake data.\n",
        "# The 'anonymize' method uses the Google Cloud Data Loss Prevention (DLP) service to identify sensitive information and the Faker library to generate the fake data.\n",
        "\n",
        "anonymized_docs"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "2c4f20067f9f"
      },
      "outputs": [],
      "source": [
        "# Convert the anonymized doc text to LangChain's `Document` format.\n",
        "# Entries that are already `Document` objects are kept as they are, so running\n",
        "# this cell a second time does not fail on its own output.\n",
        "anonymized_docs = [\n",
        "    doc\n",
        "    if isinstance(doc, Document)\n",
        "    else Document(page_content=doc, metadata={\"source\": \"local\"})\n",
        "    for doc in anonymized_docs\n",
        "]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "bd28379fc9d9"
      },
      "outputs": [],
      "source": [
        "anonymized_docs"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "38bdc2782f36"
      },
      "outputs": [],
      "source": [
        "from langchain_google_vertexai import VertexAIEmbeddings\n",
        "\n",
        "gemini_embeddings = VertexAIEmbeddings(model_name=\"text-embedding-005\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "6a98f15bc7fe"
      },
      "outputs": [],
      "source": [
        "# Save to disk\n",
        "vectorstore = Chroma.from_documents(\n",
        "    documents=anonymized_docs,  # Data\n",
        "    embedding=gemini_embeddings,  # Embedding model\n",
        "    # Stable IDs keep the persisted collection from accumulating a new copy\n",
        "    # of the same document every time this cell is run\n",
        "    ids=[f\"anonymized-doc-{index}\" for index in range(len(anonymized_docs))],\n",
        "    persist_directory=\"./chroma_db4\",  # Directory to save data\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "32cada6715ca"
      },
      "outputs": [],
      "source": [
        "# Inspect what is stored in the vector store.\n",
        "# Called without arguments, `get()` returns the stored entries (their IDs, documents and metadata)\n",
        "# rather than running a similarity search, so it is a quick way to confirm that the anonymized text was indexed.\n",
        "\n",
        "vectorstore.get()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "81edf0e6ec45"
      },
      "outputs": [],
      "source": [
        "# Load from disk\n",
        "vectorstore_disk = Chroma(\n",
        "    persist_directory=\"./chroma_db4\",  # Directory of db\n",
        "    embedding_function=gemini_embeddings,  # Embedding model\n",
        ")\n",
        "# Get the Retriever interface for the store to use later.\n",
        "# When an unstructured query is given to a retriever it will return documents.\n",
        "# Read more about retrievers in the following link.\n",
        "# https://python.langchain.com/docs/modules/data_connection/retrievers/\n",
        "#\n",
        "# Since only 1 document is stored in the Chroma vector store, search_kwargs `k`\n",
        "# is set to 1 to decrease the `k` value of chroma's similarity search from 4 to\n",
        "# 1. If you don't pass this value, you will get a warning.\n",
        "retriever = vectorstore_disk.as_retriever(search_kwargs={\"k\": 1})"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "d030d0f976a5"
      },
      "outputs": [],
      "source": [
        "from langchain_google_vertexai import ChatVertexAI\n",
        "\n",
        "llm = ChatVertexAI(model_name=\"gemini-2.0-flash\", temperature=0.7)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "f73ec8e50f2e"
      },
      "outputs": [],
      "source": [
        "# Prompt template to query Gemini\n",
        "llm_prompt_template = \"\"\"You are an assistant for question-answering tasks.\n",
        "Use the following context to answer the question.\n",
        "If you don't know the answer, just say that you don't know.\n",
        "Use five sentences maximum and keep the answer concise.\\n\n",
        "Question: {question} \\nContext: {context} \\nAnswer:\"\"\"\n",
        "\n",
        "llm_prompt = PromptTemplate.from_template(llm_prompt_template)\n",
        "\n",
        "print(llm_prompt)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "f5d3b3c9a030"
      },
      "outputs": [],
      "source": [
        "# Combine data from documents to readable string format.\n",
        "\n",
        "\n",
        "def format_docs(anonymized_docs):\n",
        "    \"\"\"Return the page content of every document, separated by blank lines.\"\"\"\n",
        "    page_texts = [doc.page_content for doc in anonymized_docs]\n",
        "    return \"\\n\\n\".join(page_texts)\n",
        "\n",
        "\n",
        "# Build the \"stuff documents\" chain with LCEL.\n",
        "#\n",
        "# LCEL chains elements together with `|`, so each step receives the output\n",
        "# of the previous one:\n",
        "# 1. `context` is filled by sending the question to the Chroma retriever and\n",
        "#    joining the returned documents with `format_docs`.\n",
        "# 2. `question` is the value passed to `invoke`, forwarded unchanged by\n",
        "#    `RunnablePassthrough`.\n",
        "# 3. The prompt template is populated with `context` and `question`.\n",
        "# 4. The populated prompt is sent to the LLM (`gemini-2.0-flash`).\n",
        "# 5. The output parser turns the model's response into a string.\n",
        "rag_chain = (\n",
        "    {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
        "    | llm_prompt\n",
        "    | llm\n",
        "    | StrOutputParser()\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "d07a036e55b8"
      },
      "outputs": [],
      "source": [
        "# This method is typically used to ask a question to the RAG (Retrieval-Augmented Generation) model containing Embeddings created with anonymized PII data.\n",
        "# The RAG model is a type of question-answering model that retrieves relevant documents from a corpus and then generates an answer based on those documents.\n",
        "# The result of this method call will be the answer to the question as generated by the RAG model before De-anonymization - This is not what we want.\n",
        "\n",
        "rag_chain.invoke(\"Who is the CEO of Vodafone Group?\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "6d43a3b1b7ad"
      },
      "outputs": [],
      "source": [
        "# The question is anonymized first so that it matches the anonymized data in the vector store.\n",
        "# The RAG chain then answers the anonymized question.\n",
        "# Finally the answer is de-anonymized by replacing the fake data with the original data - This is what we want to achieve.\n",
        "\n",
        "anonymized_text = anonymizer.anonymize(\"Who is the CEO of Vodafone Group?\")\n",
        "\n",
        "# `anonymize` prints the error and returns None when a Google API call fails;\n",
        "# stop here instead of sending None to the chain\n",
        "if anonymized_text is None:\n",
        "    raise RuntimeError(\"Anonymization failed; see the error printed above.\")\n",
        "\n",
        "# Invoke the rag_chain with anonymized text\n",
        "response = rag_chain.invoke(anonymized_text)\n",
        "\n",
        "# Deanonymize the response\n",
        "anonymizer.deanonymize(response)"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "name": "RAG_Based_on_Sensitive_Data_Protection_using_Faker.ipynb",
      "toc_visible": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
